Data Mining Final Project

Author

RAVLEEN KAUR CHADHA, MRIDULA KALAISELVAN, YASH SHARMA

if(!require(pacman)) 
  install.packages("pacman") 

devtools::install_github("tidyverse/dsbox") 
pacman::p_load(tidyverse, 
               scales, 
               devtools,
               here,
               class, 
               plotly,
               dplyr,
               caret,
               BiocManager,
               smotefamily,
               pROC, 
               mclust, 
               factoextra, 
               cluster, 
               dbscan,
               arules,
               arulesViz,
               )
# Reading the dataset
hepatitis <- read_csv("HepatitisCdata.csv")
# Handling the NA values by replacing them by the median of each column
hepatitis <- hepatitis %>%
  mutate(across(where(is.numeric), ~ ifelse(is.na(.), median(., na.rm = TRUE), .)))

# Verifying that all NA values have been handled
colSums(is.na(hepatitis))
    ...1 Category      Age      Sex      ALB      ALP      ALT      AST 
       0        0        0        0        0        0        0        0 
     BIL      CHE     CHOL     CREA      GGT     PROT 
       0        0        0        0        0        0 
write.csv(hepatitis, "HepatitisC_Cleaned.csv", row.names = FALSE)
# EDA for the dataset

# Defining a colorblind-friendly palette

color_palette <- c("#440154FF", "#3B528BFF", "#21908CFF", "#5DC863FF", "#FDE725FF")

# Creating an enhanced interactive scatterplot

p <- plot_ly(
  data = hepatitis,
  x = ~AST, 
  y = ~ALT, 
  type = 'scatter', 
  mode = 'markers',
  color = ~as.factor(Category),
  colors = color_palette,
  marker = list(
    size = 8,
    opacity = 0.8
  ),
  text = ~paste(
    "Category:", case_when(
      Category == 0 ~ "Blood Donor",
      Category == 1 ~ "Suspect Blood Donor",
      Category == 2 ~ "Hepatitis",
      Category == 3 ~ "Fibrosis",
      Category == 4 ~ "Cirrhosis",
      TRUE ~ as.character(Category)
    ),
    "<br>AST:", AST,
    "<br>ALT:", ALT,
    "<br>Age:", Age
  )
) %>%
  layout(
    title = list(
      text = "Interactive Scatterplot: AST vs ALT by Category",
      font = list(size = 18)
    ),
    xaxis = list(
      title = "AST (Aspartate Transaminase)",
      titlefont = list(size = 14),
      zeroline = FALSE,
      showgrid = FALSE
    ),
    yaxis = list(
      title = "ALT (Alanine Transaminase)",
      titlefont = list(size = 14),
      zeroline = FALSE,
      showgrid = FALSE
    ),
    legend = list(
      title = list(text = "Category"),
      font = list(size = 12),
      orientation = "v",
      x = 1.1,
      y = 0.5,
      xanchor = "left"
    ),
    plot_bgcolor = "#FFFFFF",
    paper_bgcolor = "#FFFFFF"
  )

# Updating legend labels after plot creation

p <- p %>% layout(
  legend = list(
    title = list(text = "Category"),
    font = list(size = 12),
    orientation = "v",
    x = 1.1,
    y = 0.5,
    xanchor = "left",
    traceorder = "normal",
    itemsizing = "constant",
    labels = list(
      "0" = "Blood Donor",
      "1" = "Suspect Blood Donor",
      "2" = "Hepatitis",
      "3" = "Fibrosis",
      "4" = "Cirrhosis"
    )
  )
)

# Displaying the plot
p

Classification Analysis for the dataset Hepatitis-C - Logistic Regression

# Install and load required libraries
if (!requireNamespace("caret", quietly = TRUE)) {
  install.packages("caret")
}
if (!requireNamespace("themis", quietly = TRUE)) {
  install.packages("themis")
}
if (!requireNamespace("recipes", quietly = TRUE)) {
  install.packages("recipes")
}
if (!requireNamespace("pROC", quietly = TRUE)) {
  install.packages("pROC")
}

library(caret)
library(themis)
library(recipes)
library(pROC)

# Load the dataset
hepatitis <- read.csv("HepatitisC_Cleaned.csv")

# Convert 'Category' into a binary variable (e.g., "Blood Donor" vs. others)
hepatitis <- hepatitis %>%
  mutate(BinaryCategory = ifelse(Category == "0=Blood Donor", 1, 0))

# Encode 'Sex' as a numeric variable (e.g., "m" -> 1, "f" -> 0)
hepatitis <- hepatitis %>%
  mutate(Sex = ifelse(Sex == "m", 1, 0))

# Exclude unnecessary columns
hepatitis <- hepatitis %>%
  select(-...1, -Category)  # Remove index and original 'Category'

# Ensure the target variable is a factor
hepatitis$BinaryCategory <- as.factor(hepatitis$BinaryCategory)

# Split the dataset into training and testing sets
set.seed(123)  # For reproducibility
train_index <- createDataPartition(hepatitis$BinaryCategory, p = 0.8, list = FALSE)
train_data <- hepatitis[train_index, ]
test_data <- hepatitis[-train_index, ]

# Create a recipe for SMOTE
smote_recipe <- recipe(BinaryCategory ~ ., data = train_data) %>%
  step_smote(BinaryCategory, over_ratio = 1)  # Balance classes to a 1:1 ratio

# Prepare the SMOTE dataset
smote_data <- prep(smote_recipe) %>%
  bake(new_data = NULL)

# Check the class distribution after SMOTE
print("Class distribution after applying SMOTE:")
[1] "Class distribution after applying SMOTE:"
print(table(smote_data$BinaryCategory))

  0   1 
427 427 
# Train logistic regression on the SMOTE-balanced dataset
smote_logistic_model <- glm(BinaryCategory ~ ., data = smote_data, family = "binomial")

# Summarize the model
print("Logistic Regression Model Summary:")
[1] "Logistic Regression Model Summary:"
print(summary(smote_logistic_model))

Call:
glm(formula = BinaryCategory ~ ., family = "binomial", data = smote_data)

Coefficients:
             Estimate Std. Error z value Pr(>|z|)    
(Intercept)  6.246494   3.005821   2.078 0.037697 *  
Age          0.055334   0.019529   2.833 0.004605 ** 
Sex          1.277596   0.462124   2.765 0.005699 ** 
ALB          0.170787   0.051974   3.286 0.001016 ** 
ALP          0.091808   0.011753   7.812 5.65e-15 ***
ALT          0.033121   0.007777   4.259 2.06e-05 ***
AST         -0.163989   0.021844  -7.507 6.04e-14 ***
BIL         -0.084453   0.025373  -3.328 0.000873 ***
CHE         -0.239305   0.104772  -2.284 0.022368 *  
CHOL         0.646587   0.210365   3.074 0.002115 ** 
CREA        -0.024328   0.004495  -5.412 6.22e-08 ***
GGT         -0.038332   0.006404  -5.986 2.15e-09 ***
PROT        -0.182912   0.040763  -4.487 7.22e-06 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 1183.90  on 853  degrees of freedom
Residual deviance:  251.74  on 841  degrees of freedom
AIC: 277.74

Number of Fisher Scoring iterations: 9
# Predict on the test set
smote_predictions <- predict(smote_logistic_model, newdata = test_data, type = "response")

# Convert probabilities to class labels (threshold = 0.5)
smote_predicted_classes <- ifelse(smote_predictions > 0.5, 1, 0)

# Evaluate the model
smote_confusion_matrix <- confusionMatrix(as.factor(smote_predicted_classes), as.factor(test_data$BinaryCategory))
print("Confusion Matrix after applying SMOTE:")
[1] "Confusion Matrix after applying SMOTE:"
print(smote_confusion_matrix)
Confusion Matrix and Statistics

          Reference
Prediction  0  1
         0 14  7
         1  2 99
                                          
               Accuracy : 0.9262          
                 95% CI : (0.8646, 0.9657)
    No Information Rate : 0.8689          
    P-Value [Acc > NIR] : 0.03364         
                                          
                  Kappa : 0.7142          
                                          
 Mcnemar's Test P-Value : 0.18242         
                                          
            Sensitivity : 0.8750          
            Specificity : 0.9340          
         Pos Pred Value : 0.6667          
         Neg Pred Value : 0.9802          
             Prevalence : 0.1311          
         Detection Rate : 0.1148          
   Detection Prevalence : 0.1721          
      Balanced Accuracy : 0.9045          
                                          
       'Positive' Class : 0               
                                          
# Plot ROC curve for the SMOTE model
smote_roc_curve <- roc(as.numeric(test_data$BinaryCategory), smote_predictions)
plot(smote_roc_curve, col = "red", main = "ROC Curve for Logistic Regression (SMOTE)")

Interpretation of the classification - Logistic Regression

Understanding important terms before proceeding…

What is SMOTE? Tried to keep it as easy as possible…

Imagine you’re teaching a class where there are 90 boys and only 10 girls, and you’re trying to predict something about them (e.g., who will pass an exam). Since there are so many more boys than girls, most of your predictions might focus on boys, and you’d struggle to give fair attention to the girls. This is called class imbalance.

SMOTE is like a clever trick to “even the playing field.” It helps balance the boys and girls (or majority and minority groups in your data) by creating new, synthetic data points for the smaller group (girls, in this case) instead of just copying the existing ones.

How Does SMOTE Work? (Super Simple Example) Original Situation:

You have 10 girls (minority group). Instead of just copying the same girls, SMOTE creates “new girls” based on the ones you already have. These aren’t exact copies but are slightly adjusted versions that look realistic.

How It Creates New Data:

Imagine you know two girls, Alice and Bella, who both scored well on the last test. SMOTE “imagines” a new girl, “Cathy,” whose score is somewhere between Alice’s and Bella’s scores. Cathy is not real but synthetic and helps give the smaller group more representation. Result After SMOTE:

Now, instead of 90 boys and 10 girls, you might have 90 boys and 50 girls (original 10 plus 40 synthetic girls). This balanced data gives your model a fair chance to learn about both groups.

Sensitivity: How well you catch actual threats (e.g., detecting weapons). High sensitivity means you don’t miss any weapons.

Specificity: How well you avoid stopping harmless people (e.g., avoiding false alarms for innocent passengers). High specificity means you don’t wrongly suspect harmless people.

What Does This Visualization Show? This is a ROC Curve (Receiver Operating Characteristic Curve). It’s a graph that helps us understand how well our model is doing at separating two groups:

Blood Donors (Positive Class: 1) Non-Donors (Negative Class: 0) The red curve represents the performance of the model. It shows how good the model is at balancing two things:

Sensitivity (True Positives): How many “Blood Donors” the model correctly identifies. Specificity (True Negatives): How many “Non-Donors” the model correctly identifies. The diagonal gray line is a “coin toss” or random guess. If the red curve were close to this line, the model would be as good as random guessing.

What Question Does This Visualization Answer? “How well can this model distinguish between Blood Donors and Non-Donors?” It shows how accurately the model predicts who is a donor and who is not, across all possible thresholds (cut-off points for deciding whether someone is classified as a donor).

How to Interpret This Curve in Simple Terms The red curve is very close to the top-left corner.

Top-left corner means perfect performance: it catches all donors without falsely labeling “Non-Donors” as donors. Our model does an excellent job of separating the two groups. The steep rise near the beginning tells us:

The model identifies most “Blood Donors” (Sensitivity is high) with very few mistakes early on. High specificity and balanced accuracy:

The flat section at the top shows the model continues to correctly classify “Non-Donors” even as it adjusts thresholds. Overall, this curve shows the model is doing much better than guessing (red curve far above the diagonal line).

Why Is This Important?

This visualization helps answer:

“How reliable is this model?” The red curve being far from the gray line means the model is reliable.

“Does the model handle Blood Donors (minority class) well?” Yes, because the curve rises steeply, showing it catches “Blood Donors” early and accurately.

“Should we trust the model’s predictions?”

Yes, because the curve indicates that the model balances errors well between identifying donors and non-donors.

This graph tells us:

The model can distinguish between donors and non-donors accurately. It catches most donors (87.5%) while making very few mistakes. The model is better than random guessing and does a good job even with imbalanced data, thanks to SMOTE.

library(ggplot2)
test_data$Predicted_Probabilities <- smote_predictions
ggplot(test_data, aes(x = Predicted_Probabilities, fill = BinaryCategory)) +
  geom_density(alpha = 0.5) +
  labs(title = "Probability Distribution for Logistic Regression",
       x = "Predicted Probability",
       y = "Density",
       fill = "Actual Class") +
  theme_minimal()

coef_df <- as.data.frame(summary(smote_logistic_model)$coefficients)
coef_df <- coef_df[-1, ]  # Exclude intercept
coef_df$Feature <- rownames(coef_df)
ggplot(coef_df, aes(x = reorder(Feature, abs(Estimate)), y = Estimate)) +
  geom_bar(stat = "identity", fill = "steelblue") +
  coord_flip() +
  labs(title = "Feature Importance in Logistic Regression",
       x = "Feature",
       y = "Coefficient Value") +
  theme_minimal()

ggplot(smote_data, aes(x = ALT, y = AST, color = BinaryCategory)) +
  geom_point() +
  stat_smooth(method = "glm", method.args = list(family = "binomial"), se = FALSE) +
  labs(title = "Decision Boundary of Logistic Regression",
       x = "ALT",
       y = "AST") +
  theme_minimal()

GMM Clustering

# Step 1: Load the dataset
# Assuming the data has been preprocessed (scaled, PCA applied, and features weighted)
hepatitis_data <- read.csv("HepatitisC_Cleaned.csv")

# Select numeric columns for clustering
numeric_data <- hepatitis_data[sapply(hepatitis_data, is.numeric)]
scaled_data <- scale(numeric_data)

# Perform PCA for dimensionality reduction
pca <- prcomp(scaled_data, center = TRUE, scale. = TRUE)
pca_data <- pca$x[, 1:2]  # Retain the first two principal components

# Step 2: Apply Gaussian Mixture Models (GMM)
set.seed(123)  # Ensure reproducibility
gmm_model <- Mclust(pca_data)

# Optimal number of clusters
cat("Optimal Number of Clusters (GMM):", gmm_model$G, "\n")
Optimal Number of Clusters (GMM): 2 
# Step 3: Evaluate Clustering Quality
# Silhouette Score
silhouette_scores <- silhouette(gmm_model$classification, dist(pca_data))
avg_silhouette <- mean(silhouette_scores[, 3])
cat("Average Silhouette Score (GMM):", round(avg_silhouette, 2), "\n")
Average Silhouette Score (GMM): 0.62 
# Adjusted Rand Index (ARI)
if ("Category" %in% colnames(hepatitis_data)) {
  actual_labels <- as.numeric(factor(hepatitis_data$Category))  # Replace 'Category' with your actual label column
  ari <- adjustedRandIndex(actual_labels, gmm_model$classification)
  cat("Adjusted Rand Index (ARI):", round(ari, 2), "\n")
} else {
  cat("No actual labels provided for ARI calculation.\n")
}
Adjusted Rand Index (ARI): 0.63 
# Step 5: Add Cluster Assignments to Dataset
hepatitis_data$Cluster <- as.factor(gmm_model$classification)

# Step 6: Analyze Clusters
# Summary statistics for each cluster
cluster_summary <- hepatitis_data %>%
  group_by(Cluster) %>%
  summarise(across(where(is.numeric), mean, na.rm = TRUE))

# Display cluster summary
print(cluster_summary)
# A tibble: 2 × 13
  Cluster  ...1   Age   ALB   ALP   ALT   AST   BIL   CHE  CHOL  CREA   GGT
  <fct>   <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1        281.  46.6  42.4  66.5  26.1  28.0  8.79  8.46  5.46  78.7  29.2
2 2        545.  54.6  34.3  84.0  49.1  95.4 34.7   5.82  4.54 105.  132. 
# ℹ 1 more variable: PROT <dbl>
# Enhanced GMM Clustering Visualization with Labels
ggplot(as.data.frame(pca_data), aes(x = PC1, y = PC2, color = as.factor(gmm_model$classification))) +
  geom_point(size = 3, alpha = 0.7) +  # Data points
  stat_ellipse(aes(fill = as.factor(gmm_model$classification)), alpha = 0.2, geom = "polygon") +  # Ellipses
  scale_color_manual(values = c("#4CAF50", "#FF5722"), name = "Cluster",
                     labels = c("Cluster 1: Healthy/Low Risk", "Cluster 2: At-Risk/Diseased")) +
  scale_fill_manual(values = c("#4CAF50", "#FF5722"), name = "Cluster",
                    labels = c("Cluster 1: Healthy/Low Risk", "Cluster 2: At-Risk/Diseased")) +
  labs(
    title = "Gaussian Mixture Model Clustering",
    subtitle = "Principal Components and Identified Groups",
    x = "Principal Component 1 (Explains 33.3% Variance)",
    y = "Principal Component 2 (Explains 20.5% Variance)"
  ) +
  theme_minimal(base_size = 15) +
  theme(
    plot.title = element_text(hjust = 0.5, face = "bold"),
    plot.subtitle = element_text(hjust = 0.5),
    legend.position = "right",
    legend.title = element_text(face = "bold"),
    legend.text = element_text(size = 10), 
    axis.title.x = element_text(size = 9), 
    axis.title.y = element_text(size = 9)
  )

# Scale the data
scaled_data <- scale(pca_data)

# k-NN Distance Plot
kNNdist <- kNNdistplot(scaled_data, k = 5)  # k = minPts (adjust based on data)
abline(h = 1, col = "red", lty = 2)  # Adjust the threshold manually

# Apply DBSCAN with eps = 1.0 and minPts = 5 (rule of thumb: dimensions + 1)
dbscan_model <- dbscan(scaled_data, eps = 1, minPts = 5)

# Print cluster assignments
cat("Cluster Assignments:\n")
Cluster Assignments:
table(dbscan_model$cluster)

  0   1   2 
 13 598   4 
# Calculate the percentage of noise
noise_percentage <- sum(dbscan_model$cluster == 0) / nrow(scaled_data) * 100
cat("Noise Percentage:", round(noise_percentage, 2), "%\n")
Noise Percentage: 2.11 %
# Compute silhouette scores
silhouette_scores <- silhouette(dbscan_model$cluster, dist(scaled_data))
avg_silhouette <- mean(silhouette_scores[, 3])

# Print average silhouette score
cat("Average Silhouette Score (DBSCAN):", round(avg_silhouette, 2), "\n")
Average Silhouette Score (DBSCAN): 0.65 
cat("Cluster Sizes:\n")
Cluster Sizes:
table(dbscan_model$cluster)

  0   1   2 
 13 598   4 
# Visualize DBSCAN clusters
ggplot(as.data.frame(pca_data), aes(x = PC1, y = PC2, color = as.factor(dbscan_model$cluster))) +
  geom_point(size = 3, alpha = 0.7) +
  scale_color_manual(
    values = c("#999999", "#E41A1C", "#377EB8", "#4DAF4A", "#FF7F00"),
    name = "Cluster",
    labels = c("Noise", "Cluster 1", "Cluster 2", "Cluster 3", "Cluster 4")
  ) +
  labs(
    title = "DBSCAN Clustering Results with eps = 1.0",
    x = "Principal Component 1",
    y = "Principal Component 2",
    caption = "Noise is represented by Cluster 0"
  ) +
  theme_minimal(base_size = 15) +
  theme(
    plot.title = element_text(hjust = 0.5, face = "bold"),
    plot.subtitle = element_text(hjust = 0.5),
    legend.position = "right",
    legend.title = element_text(face = "bold"),
    legend.text = element_text(size = 12)
  )

cat("DBSCAN Results Summary:\n")
DBSCAN Results Summary:
cat("Number of Clusters:", length(unique(dbscan_model$cluster)) - 1, "\n")  # Exclude noise cluster
Number of Clusters: 2 
cat("Noise Percentage:", round(noise_percentage, 2), "%\n")
Noise Percentage: 2.11 %
cat("Average Silhouette Score:", round(avg_silhouette, 2), "\n")
Average Silhouette Score: 0.65 
# Install and load required libraries
if (!requireNamespace("e1071", 
                      quietly = TRUE)) {
  install.packages("e1071")
}
if (!requireNamespace("caret", 
                      quietly = TRUE)) {
  install.packages("caret")
}
if (!requireNamespace("dplyr",
                      quietly = TRUE)) {
  install.packages("dplyr")
}
library(e1071)
library(caret)
library(dplyr)

# Load the dataset
hepatitis <- read.csv("HepatitisC_Cleaned.csv")

# Preprocessing
# Encode 'Sex' as a factor
hepatitis$Sex <- as.factor(hepatitis$Sex)

# Convert 'Category' to a factor for multi-class classification
hepatitis$Category <- as.factor(hepatitis$Category)

# Split the data into training and testing sets
set.seed(123)  # For reproducibility
trainIndex <- createDataPartition(hepatitis$Category, p = 0.8, list = FALSE)
trainData <- hepatitis[trainIndex, ]
testData <- hepatitis[-trainIndex, ]

# Identify numeric features
numeric_features <- names(which(sapply(trainData, is.numeric)))

# Scale numeric features
scaled_train <- trainData
scaled_test <- testData

for (col in numeric_features) {
  # Get scaling parameters from the training set
  col_mean <- mean(trainData[[col]], na.rm = TRUE)
  col_sd <- sd(trainData[[col]], na.rm = TRUE)
  
  # Scale training and test data
  scaled_train[[col]] <- (trainData[[col]] - col_mean) / col_sd
  scaled_test[[col]] <- (testData[[col]] - col_mean) / col_sd
}

# Train the SVM model with a radial basis function kernel
svm_model <- svm(Category ~ ., data = scaled_train, kernel = "radial", cost = 1, gamma = 0.1)

# Predict on the test set
svm_predictions <- predict(svm_model, newdata = scaled_test)

# Evaluate the model
conf_matrix <- confusionMatrix(svm_predictions, scaled_test$Category)
print("Confusion Matrix for SVM:")
[1] "Confusion Matrix for SVM:"
print(conf_matrix)
Confusion Matrix and Statistics

                        Reference
Prediction               0=Blood Donor 0s=suspect Blood Donor 1=Hepatitis
  0=Blood Donor                    106                      0           0
  0s=suspect Blood Donor             0                      0           0
  1=Hepatitis                        0                      0           3
  2=Fibrosis                         0                      0           0
  3=Cirrhosis                        0                      1           1
                        Reference
Prediction               2=Fibrosis 3=Cirrhosis
  0=Blood Donor                   0           0
  0s=suspect Blood Donor          0           0
  1=Hepatitis                     0           0
  2=Fibrosis                      3           1
  3=Cirrhosis                     1           5

Overall Statistics
                                          
               Accuracy : 0.9669          
                 95% CI : (0.9175, 0.9909)
    No Information Rate : 0.876           
    P-Value [Acc > NIR] : 0.0004865       
                                          
                  Kappa : 0.8546          
                                          
 Mcnemar's Test P-Value : NA              

Statistics by Class:

                     Class: 0=Blood Donor Class: 0s=suspect Blood Donor
Sensitivity                         1.000                      0.000000
Specificity                         1.000                      1.000000
Pos Pred Value                      1.000                           NaN
Neg Pred Value                      1.000                      0.991736
Prevalence                          0.876                      0.008264
Detection Rate                      0.876                      0.000000
Detection Prevalence                0.876                      0.000000
Balanced Accuracy                   1.000                      0.500000
                     Class: 1=Hepatitis Class: 2=Fibrosis Class: 3=Cirrhosis
Sensitivity                     0.75000           0.75000            0.83333
Specificity                     1.00000           0.99145            0.97391
Pos Pred Value                  1.00000           0.75000            0.62500
Neg Pred Value                  0.99153           0.99145            0.99115
Prevalence                      0.03306           0.03306            0.04959
Detection Rate                  0.02479           0.02479            0.04132
Detection Prevalence            0.02479           0.03306            0.06612
Balanced Accuracy               0.87500           0.87073            0.90362
# Install and load the GGally package if not already installed
if (!requireNamespace("GGally", quietly = TRUE)) {
  install.packages("GGally")
}
library(GGally)

# Specify the features to include in the plot
selected_features <- c("Age", "ALB", "ALP", "ALT", "AST", "BIL")

# Filter the dataset for specific categories
filtered_data <- scaled_train %>%
  filter(Category %in% c("1=Hepatitis", "2=Fibrosis", "3=Cirrhosis"))  # Assuming "1" = Hepatitis, "2" = Fibrosis, "3" = Cirrhosis

# Parallel coordinates plot for selected features and categories
ggparcoord(data = filtered_data,
           columns = which(names(filtered_data) %in% selected_features),
           groupColumn = "Category",
           scale = "std", alphaLines = 0.5) +
  labs(title = "Parallel Coordinates Plot for Hepatitis, Fibrosis, and Cirrhosis",
       x = "Features", y = "Scaled Values") +
  theme_minimal(base_size = 14)

Association rule using Apriori Algorithm

# 1. Data Preparation
hepatitis_categorized <- hepatitis %>%
  mutate(
    Age_Group = case_when(
      Age < 35 ~ "Young",
      Age < 50 ~ "Middle",
      Age < 65 ~ "Senior",
      TRUE ~ "Elderly"
    ),
    ALT_Level = case_when(
      ALT < 40 ~ "Normal",
      ALT < 80 ~ "Mild_Elevation",
      ALT < 200 ~ "Moderate_Elevation",
      TRUE ~ "Severe_Elevation"
    ),
    AST_Level = case_when(
      AST < 40 ~ "Normal",
      AST < 80 ~ "Mild_Elevation",
      AST < 200 ~ "Moderate_Elevation",
      TRUE ~ "Severe_Elevation"
    ),
    Disease_Status = case_when(
      Category == 0 ~ "Blood_Donor",
      Category == 1 ~ "Suspect",
      Category == 2 ~ "Hepatitis",
      Category == 3 ~ "Fibrosis",
      Category == 4 ~ "Cirrhosis"
    )
  )

# 2. Create transactions
hepatitis_trans <-
  as(data.frame(select(hepatitis_categorized, 
    Age_Group, ALT_Level, AST_Level, Disease_Status)), 
    "transactions")

# 3. Apply Apriori algorithm
rules <- apriori(hepatitis_trans,
    parameter = list(
        support = 0.03,
        confidence = 0.5,
        minlen = 2,
        maxlen = 4
    ))
Apriori

Parameter specification:
 confidence minval smax arem  aval originalSupport maxtime support minlen
        0.5    0.1    1 none FALSE            TRUE       5    0.03      2
 maxlen target  ext
      4  rules TRUE

Algorithmic control:
 filter tree heap memopt load sort verbose
    0.1 TRUE TRUE  FALSE TRUE    2    TRUE

Absolute minimum support count: 18 

set item appearances ...[0 item(s)] done [0.00s].
set transactions ...[12 item(s), 615 transaction(s)] done [0.00s].
sorting and recoding items ... [9 item(s)] done [0.00s].
creating transaction tree ... done [0.00s].
checking subsets of size 1 2 3 done [0.00s].
writing ... [25 rule(s)] done [0.00s].
creating S4 object  ... done [0.00s].
# 4. Process rules
rules_pruned <- 
  rules[!is.redundant(rules)]
rules_sorted <- 
  sort(rules_pruned, by = "lift", decreasing = TRUE)

# 5. Create rules dataframe
rules_df <- data.frame(
    lhs = labels(lhs(rules_sorted)),
    rhs = labels(rhs(rules_sorted)),
    support = quality(rules_sorted)$support,
    confidence = quality(rules_sorted)$confidence,
    lift = quality(rules_sorted)$lift
)

# 6. Visualization
# Scatter Plot
scatter_plot <- plot_ly(rules_df, 
        x = ~support,
        y = ~confidence,
        color = ~lift,
        type = "scatter",
        mode = "markers",
        marker = list(size = 10),
        text = ~paste("Rule:", lhs, "=>", rhs)) %>%
    layout(
        title = "Apriori Algorithm",
        xaxis = list(title = "Support"),
        yaxis = list(title = "Confidence"),
        colorbar = list(title = "Lift")
    )



# 7. Print Quality Assessment
cat("\nAssociation Rule Analysis Summary:")

Association Rule Analysis Summary:
cat("\nNumber of Rules:", nrow(rules_df))

Number of Rules: 21
cat("\nAverage Confidence (Accuracy):", round(mean(rules_df$confidence), 4))

Average Confidence (Accuracy): 0.7977
cat("\nAverage Lift:", round(mean(rules_df$lift), 4))

Average Lift: 1.0168
cat("\nAverage Support:", round(mean(rules_df$support), 4))

Average Support: 0.2092
# Print top 10 rules
cat("\n\nTop 10 Rules by Lift:\n")


Top 10 Rules by Lift:
head(rules_df[order(-rules_df$lift), 
              c("lhs", "rhs", "support", "confidence", "lift")], 10)
                                           lhs                rhs    support
1  {ALT_Level=Mild_Elevation,AST_Level=Normal} {Age_Group=Middle} 0.05691057
2                   {ALT_Level=Mild_Elevation} {Age_Group=Middle} 0.07154472
3           {Age_Group=Young,AST_Level=Normal} {ALT_Level=Normal} 0.08780488
4                   {AST_Level=Mild_Elevation} {Age_Group=Middle} 0.05040650
5         {Age_Group=Elderly,AST_Level=Normal} {ALT_Level=Normal} 0.03577236
6           {Age_Group=Young,ALT_Level=Normal} {AST_Level=Normal} 0.08780488
7          {Age_Group=Middle,ALT_Level=Normal} {AST_Level=Normal} 0.35934959
8          {Age_Group=Senior,AST_Level=Normal} {ALT_Level=Normal} 0.26829268
9                           {ALT_Level=Normal} {AST_Level=Normal} 0.75121951
10                          {AST_Level=Normal} {ALT_Level=Normal} 0.75121951
   confidence     lift
1   0.6034483 1.258036
2   0.5789474 1.206958
3   0.9473684 1.120445
4   0.5254237 1.095375
5   0.9166667 1.084135
6   0.9152542 1.082464
7   0.9132231 1.080062
8   0.9016393 1.066362
9   0.8884615 1.050777
10  0.8884615 1.050777
# Display plots
scatter_plot

CHARM Algorithm

# Load required libraries
library(tidyverse)
library(arules)
library(arulesViz)
library(plotly)

# 1. Data Preprocessing
hepatitis_categorized <- hepatitis %>%
  mutate(
    Age_Group = case_when(
      Age < 35 ~ "Young",
      Age < 50 ~ "Middle",
      Age < 65 ~ "Senior",
      TRUE ~ "Elderly"
    ),
    ALT_Level = case_when(
      ALT < 40 ~ "ALT_Normal",
      ALT < 80 ~ "ALT_Mild_Elevation",
      ALT < 200 ~ "ALT_Moderate_Elevation",
      TRUE ~ "ALT_Severe_Elevation"
    ),
    AST_Level = case_when(
      AST < 40 ~ " AST_Normal",
      AST < 80 ~ " AST_Mild_Elevation",
      AST < 200 ~ " AST_Moderate_Elevation",
      TRUE ~ " AST_Severe_Elevation"
    ),
    Disease_Status = case_when(
      Category == 0 ~ "Blood_Donor",
      Category == 1 ~ "Suspect",
      Category == 2 ~ "Hepatitis",
      Category == 3 ~ "Fibrosis",
      Category == 4 ~ "Cirrhosis"
    )
  )

# 2. CHARM Implementation
charm_association_rules <- function(data, min_support = 0.05, min_confidence = 0.5) {
    # Convert to transaction format
    trans_matrix <- as.matrix(data)
    n_trans <- nrow(trans_matrix)
    
    # Find unique items
    items <- unique(as.character(trans_matrix))
    cat("Initial items:", length(items), "\n")
    
    # Create vertical database (tid-lists)
    tid_lists <- list()
    frequent_items <- character()
    
    # Find frequent single items
    for(item in items) {
        tids <- which(apply(trans_matrix, 1, function(x) item %in% x))
        support <- length(tids)/n_trans
        if(support >= min_support) {
            tid_lists[[item]] <- tids
            frequent_items <- c(frequent_items, item)
        }
    }
    
    cat("Frequent items:", length(frequent_items), "\n")
    
    # Store closed itemsets
    closed_itemsets <- list()
    
    # Find closed itemsets
    find_closed_itemsets <- function(prefix = character(), prefix_tids = NULL, 
                                   remaining_items = frequent_items) {
        for(i in seq_along(remaining_items)) {
            item <- remaining_items[i]
            new_tids <- if(is.null(prefix_tids)) tid_lists[[item]]
                       else intersect(prefix_tids, tid_lists[[item]])
            
            support <- length(new_tids)/n_trans
            
            if(support >= min_support) {
                new_itemset <- c(prefix, item)
                
                # Check if closed
                is_closed <- TRUE
                for(existing in names(closed_itemsets)) {
                    if(all(new_itemset %in% strsplit(existing, ",")[[1]]) && 
                       closed_itemsets[[existing]]$support == support) {
                        is_closed <- FALSE
                        break
                    }
                }
                
                if(is_closed) {
                    closed_itemsets[[paste(new_itemset, collapse=",")]] <<- list(
                        items = new_itemset,
                        support = support,
                        tids = new_tids
                    )
                }
                
                # Recursive call with remaining items
                if(i < length(remaining_items)) {
                    find_closed_itemsets(new_itemset, new_tids, 
                                       remaining_items[(i+1):length(remaining_items)])
                }
            }
        }
    }
    
    # Generate closed itemsets
    cat("Generating closed itemsets...\n")
    find_closed_itemsets()
    cat("Found", length(closed_itemsets), "closed itemsets\n")
    
    # Generate rules from closed itemsets
    rules <- data.frame(
        lhs = character(),
        rhs = character(),
        support = numeric(),
        confidence = numeric(),
        lift = numeric(),
        stringsAsFactors = FALSE
    )
    
    # Generate rules
    cat("Generating association rules...\n")
    for(itemset_key in names(closed_itemsets)) {
        itemset <- closed_itemsets[[itemset_key]]
        if(length(itemset$items) >= 2) {
            for(i in 1:(length(itemset$items)-1)) {
                combs <- combn(itemset$items, i, simplify = FALSE)
                for(lhs in combs) {
                    rhs <- setdiff(itemset$items, lhs)
                    
                    # Calculate confidence
                    lhs_support <- max(sapply(names(closed_itemsets), function(key) {
                        if(all(lhs %in% closed_itemsets[[key]]$items))
                            return(closed_itemsets[[key]]$support)
                        return(0)
                    }))
                    
                    if(lhs_support > 0) {
                        confidence <- itemset$support / lhs_support
                        
                        if(confidence >= min_confidence) {
                            # Calculate lift
                            rhs_support <- max(sapply(names(closed_itemsets), function(key) {
                                if(all(rhs %in% closed_itemsets[[key]]$items))
                                    return(closed_itemsets[[key]]$support)
                                return(0)
                            }))
                            
                            lift <- confidence / rhs_support
                            
                            rules[nrow(rules) + 1,] <- list(
                                lhs = paste(lhs, collapse=", "),
                                rhs = paste(rhs, collapse=", "),
                                support = itemset$support,
                                confidence = confidence,
                                lift = lift
                            )
                        }
                    }
                }
            }
        }
    }
    
    return(rules)
}

# 3. Apply CHARM to dataset
selected_data <- select(hepatitis_categorized, 
                       Age_Group, ALT_Level, AST_Level, Disease_Status)
rules <- charm_association_rules(selected_data, min_support = 0.05, min_confidence = 0.5)
Initial items: 13 
Frequent items: 10 
Generating closed itemsets...
Found 24 closed itemsets
Generating association rules...
# 4. Visualize results
if(nrow(rules) > 0) {
    # Scatter plot
    scatter_plot <- plot_ly(rules,
        x = ~confidence,
        y = ~lift,
        color = ~support,
        type = "scatter",
        mode = "markers",
        text = ~paste("Rule:", lhs, "=>", rhs,
                     "<br>Support:", round(support, 4),
                     "<br>Confidence:", round(confidence, 4),
                     "<br>Lift:", round(lift, 4))
    ) %>%
        layout(
            title = "CHARM Algotrithm",
            xaxis = list(title = "Confidence"),
            yaxis = list(title = "Lift"),
            colorbar = list(title = "Support")
        )
    
    # Print statistics
    cat("\nAssociation Rule Analysis Results:\n")
    cat("Total rules found:", nrow(rules), "\n")
    cat("Average confidence:", round(mean(rules$confidence), 4), "\n")
    cat("Average lift:", round(mean(rules$lift), 4), "\n")
    cat("Average support:", round(mean(rules$support), 4), "\n")
    
    # Print top rules
    cat("\nTop 10 Rules by Lift:\n")
    print(head(rules[order(-rules$lift), ], 10))
    
    # Save results
    write.csv(rules, "charm_rules.csv", row.names = FALSE)
} else {
    cat("No rules found. Try adjusting the minimum support or confidence thresholds.\n")
}

Association Rule Analysis Results:
Total rules found: 23 
Average confidence: 0.8 
Average lift: 1.0279 
Average support: 0.2454 

Top 10 Rules by Lift:
                               lhs                     rhs    support
12 ALT_Mild_Elevation,  AST_Normal                  Middle 0.05691057
10              ALT_Mild_Elevation                  Middle 0.07154472
4               Young,  AST_Normal              ALT_Normal 0.08780488
14              AST_Mild_Elevation                  Middle 0.05040650
3                Young, ALT_Normal              AST_Normal 0.08780488
8               Middle, ALT_Normal              AST_Normal 0.35934959
18             Senior,  AST_Normal              ALT_Normal 0.26829268
2                            Young ALT_Normal,  AST_Normal 0.08780488
20                      ALT_Normal              AST_Normal 0.75121951
21                      AST_Normal              ALT_Normal 0.75121951
   confidence     lift
12  0.6034483 1.258036
10  0.5789474 1.206958
4   0.9473684 1.120445
14  0.5254237 1.095375
3   0.9152542 1.082464
8   0.9132231 1.080062
18  0.9016393 1.066362
2   0.7941176 1.057105
20  0.8884615 1.050777
21  0.8884615 1.050777
    # Display plot
  scatter_plot